Expand code block to see libraries and functions used.

library(dplyr)
library(data.table)
library(stringr)
library(randomForest)
library(lme4)
library(gbm)
library(dismo)
library(ROCR)
library(PresenceAbsence) # for Kappa and PCC
library(ggplot2)

get.pred <- function(modobj, modtype="GLM"){
  if(modtype=="RF"){
    p_rf.mod <- predict(modobj, type = "prob")
    pred.mod <- prediction(p_rf.mod[,2], modobj$y)
  } else if(modtype=="BRT") {
    p_brt.mod <- predict(modobj, type = "response")
    pred.mod <- prediction(p_brt.mod, modobj$data$y)
  } else {
    pred.mod <- prediction(modobj@resp$mu, modobj@resp$y)
  }
  return(pred.mod)
}

ConfMatrix <- function(predRDS, thresh, column=1){
  #Presence.Absence insists on its own data structure
  padat <- data.frame(ID=seq(1:length(predRDS@predictions[[1]])),
                      OBS=as.numeric(levels(predRDS@labels[[column]]))[predRDS@labels[[column]]],
                      PRED=predRDS@predictions[[column]])

  pacmx <- cmx(padat, threshold = thresh)

  return(pacmx)
}

#https://github.com/brendano/dlanalysis/blob/master/util.R
linelight <- function(x,y, lty='dashed', col='lightgray', ...) {
  # highlight a point with lines running to the axes.
  left = par('usr')[1]
  bot = par('usr')[3]
  segments(left,y, x,y, lty=lty, col=col, ...)
  segments(x,bot,  x,y, lty=lty, col=col, ...)
}

Model Evaluation and Review

Model Rasters

These maps are displayed using a linear min to max stretch, green (high) to white (low) color ramp.

GLM

GLMER

GLMER

RF

RF

RF

BRT

BRT

BRT

Ensemble

Ensemble

Ensemble

ROC and Other Plots

These plots are for performance against the 15% Test data

Gray solid lines indicate random performance.
Dashed lines shows the the values the axes measure for the Ensemble at the Sensitivity = 0.95 threshold (0.3352343).

trng <- c(as.numeric(testOut[4,9])-0.000001,
          as.numeric(testOut[4,9])+0.000001)
i <- which((perf.roc@alpha.values[[4]] > trng[1]) &
             (perf.roc@alpha.values[[4]] < trng[2]))

# 1) Red, 2) Green, 3) Cyan, 4) Purple
plot(perf.roc, col=as.list(rainbow(4)),
     main="ROC for the 4 models",
     xlim=c(0,1))
linelight(x=perf.roc@x.values[[4]][i], y=perf.roc@y.values[[4]][i],
          col="plum3")
abline(a = 0, b = 1, col = "gray")
legend("right", c("GLM", "RF", "BRT", "EN"), lty = "solid",
       col = rainbow(4), bty = "n")

plot(perf.err, col=as.list(rainbow(4)),
     main="Error Rate vs. Threshold Cutoff",
     xlim=c(0,1), ylim=c(0,1))
linelight(x=as.numeric(testOut[4,9]), 
          y=as.numeric(testOut[4,4]), col="plum3")
lines(x=c(0,1), y=c(0.5,0.5), col="gray")
legend("top", c("GLM", "RF", "BRT", "EN"), lty = "solid",
       col = rainbow(4), bty = "n")

i <- which((perf.flip@alpha.values[[4]] > trng[1]) &
             (perf.flip@alpha.values[[4]] < trng[2]))

plot(perf.flip, col=as.list(rainbow(4)),
     main="Flip-side of the ROC")
abline(a = 0, b = 1, col = "gray")
linelight(x=perf.flip@x.values[[4]][i],
          y=perf.flip@y.values[[4]][i],
          col="plum3")
legend("right", c("GLM", "RF", "BRT", "EN"), lty = "solid",
       col = rainbow(4), bty = "n")

i <- which((perf.PA@alpha.values[[4]] > trng[1]) &
             (perf.PA@alpha.values[[4]] < trng[2]))

plot(perf.PA, col=as.list(rainbow(4)),
     main="Precision vs. Accuracy",
     xlim=c(0.5,1), ylim=c(0.5,1))
linelight(x=perf.PA@x.values[[4]][i],
          y=perf.PA@y.values[[4]][i],
          col="plum3")
legend("left", c("GLM", "RF", "BRT", "EN"), lty = "solid",
       col = rainbow(4), bty = "n")

Accuracy: \(P(Yhat = Y)\). Estimated as: \((TP+TN)/(P+N)\)
Precision: Positive predictive value. \(P(Y = + | Yhat = +)\). Estimated as: \(TP/(TP+FP)\)

Confusion Matrix

These are calculated at Sensitivity = 0.95 and from the Testing data (of which there are 20,900 records).

…and we want them to look like this Example

cmxG <- ConfMatrix(predAll, 
                   thresh = as.numeric(testOut[1,9]), 
                   column = 1)

#Adapted from https://stackoverflow.com/questions/37897252/plot-confusion-matrix-in-r-using-ggplot
ggplot(data=as.data.frame(cmxG),
       aes(x=observed, y=predicted)) +
  geom_tile(aes(fill=Freq), color="white") +
  geom_text(aes(label=Freq), vjust=1) +
  scale_fill_gradientn(breaks=c(500,2500,5000,10000),
                      colors = hcl.colors(4, "Heat")) +
  labs(title = "GLM Confusion Matrix") +
  theme_bw() + theme(legend.position = "none")

cmxR <- ConfMatrix(predAll, 
                   thresh = as.numeric(testOut[2,9]), 
                   column = 2)

ggplot(data=as.data.frame(cmxR),
       aes(x=observed, y=predicted)) +
  geom_tile(aes(fill=Freq), color="white") +
  geom_text(aes(label=Freq), vjust=1) +
  scale_fill_gradientn(breaks=c(500,2500,5000,10000),
                      colors = hcl.colors(4, "Heat")) +
  labs(title = "RF Confusion Matrix") +
  theme_bw() + theme(legend.position = "none") 

cmxB <- ConfMatrix(predAll, 
                   thresh = as.numeric(testOut[3,9]), 
                   column = 3)

ggplot(data=as.data.frame(cmxB),
       aes(x=observed, y=predicted)) +
  geom_tile(aes(fill=Freq), color="white") +
  geom_text(aes(label=Freq), vjust=1) +
  scale_fill_gradientn(breaks=c(500,2500,5000,10000),
                      colors = hcl.colors(4, "Heat")) +
  labs(title = "BRT Confusion Matrix") +
  theme_bw() + theme(legend.position = "none")

cmxE <- ConfMatrix(predAll,
                   thresh = as.numeric(testOut[4,9]),
                   column = 4)

ggplot(data=as.data.frame(cmxE),
       aes(x=observed, y=predicted)) +
  geom_tile(aes(fill=Freq), color="white") +
  geom_text(aes(label=Freq), vjust=1) +
  scale_fill_gradientn(breaks=c(500,2500,5000,10000),
                      colors = hcl.colors(4, "Heat")) +
  labs(title = "Ensemble Confusion Matrix") +
  theme_bw() + theme(legend.position = "none")

Variable Importance

The values have all been normalized so that the sum of all variable importance measures for a model = 1. These are from the Training data.

RF response

Meh… I cannot figure out any way to get meaningful plots out of a randomForest::randomForest object.

GLM Grid_ID as random effect:

## $Grid_ID

The dots are the mean residuals and the black lines are, I think, the standard error of the residuals for each Grid_ID.
I honestly have no idea what to make of this graph, except that the data isn’t normally distributed…